In [1]:
#LOAD LIBRARIES
import numpy as np 
import pandas as pd 
import os
import json
import descartes
import fiona
import folium
import datetime
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import re
import geopandas as gpd
from shapely.geometry import Point, Polygon

%matplotlib inline

#if getting error message then perform the following
#conda install -c conda-forge geopandas
#conda install -c conda-forge pyproj
#conda install -c conda-forge rtree=0.9.3
In [2]:
#LOAD DATA
#https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j
df = pd.read_csv('NYC_Restaurant_Inspection_Results.csv') 

#KEEP RELEVANT VARIABLES
df.drop(['INSPECTION TYPE','VIOLATION CODE','VIOLATION DESCRIPTION','PHONE','BIN','BBL','NTA','Community Board','Council District','Census Tract','RECORD DATE'],axis=1,inplace=True)
In [3]:
#DATA CLEANING & FEATURE ENGINEERING 

#DATA CLEANING

#Replace spaces with underscores in column names
df.columns=df.columns.str.replace(" ","_")

#Clean up some of the names
df["DBA"].replace("'","", inplace=True) # Remove apostrophe
df["DBA"].replace(" ?\(.+\)", "", regex=True, inplace=True) # Remove values in parenthesis
df["DBA"].replace(" ?#.*", "", regex=True, inplace=True) # Remove # followed by some string

#Changing dates to datetime
df.INSPECTION_DATE=pd.to_datetime(df.INSPECTION_DATE, format="%m/%d/%Y")
df.GRADE_DATE=pd.to_datetime(df.GRADE_DATE, format="%m/%d/%Y")

#Cleaning some column values for analysis
df.replace(to_replace =["Basque", "Nuts/Confectionary", "Fruits/Vegetables", "Not Listed/Not Applicable", "English", "Bottled beverages, including water, sodas, juices, etc.","Juice, Smoothies, Fruit Salads"],
           value ="Other Rest", inplace=True)
df.replace(to_replace =["Latin American (Central and South America)"],value ="Latin American", inplace=True)
df.replace(to_replace =["Sandwiches/Salads/Mixed Buffet","Soups/Salads/Sandwiches"],value ="Sandwiches", inplace=True)
df.replace(to_replace =["Chinese/Cuban"],value ="Chinese", inplace=True)
df.replace(to_replace =["Hotdogs/Pretzels"],value ="Hotdogs", inplace=True)
df.replace(to_replace =["American"],value ="American Rest", inplace=True)
df.replace(to_replace =["Asian"],value ="Asian Rest", inplace=True)
df.replace(to_replace =["African"],value ="African Rest", inplace=True)
df.replace(to_replace =["Middle Eastern"],value ="Middle Eastern Rest", inplace=True)

#Feature Engineering(Category, Unique Key, Count, Tooltip)
df['category']=df['CUISINE_DESCRIPTION'].apply(lambda x:x.replace('Chinese','Asian').replace('Pizza','Pizza/Sandwich')
                                               .replace('Italian','European').replace('Japanese','Asian').replace('Mexican','Latin American')
                                               .replace('Bakery','Bakery/Desert').replace('Spanish','European').replace('Sandwiches','Pizza/Sandwich')
                                                .replace('Chicken','American').replace('Indian','Asian').replace('Hamburgers','American')
                                               .replace('Donuts','Bakery/Desert').replace('Jewish/Kosher','Other').replace('Korean','Asian')
                                               .replace('Thai','Asian').replace('Juice, Smoothies, Fruit Salads','Other').replace('French','European')
                                               .replace('Mediterranean','African/Middle Eastern').replace('Tex-Mex','Other').replace('Irish','European')
                                               .replace('Ice Cream, Gelato, Yogurt, Ices','Bakery/Desert').replace('Seafood','Other').replace('Bagels/Pretzels','Bakery/Desert')
                                               .replace('Peruvian','Latin American').replace('Sandwiches/Salads/Mixed Buffet','Pizza/Sandwich').replace('Eastern European','European')
                                               .replace('African Rest','African/Middle Eastern').replace('Vietnamese/Cambodian/Malaysia','Asian').replace('Greek','European')
                                               .replace('Steak','Other').replace('Sandwiches','Pizza/Sandwich').replace('Soul Food','Other')
                                               .replace('Barbecue','Other').replace('Salads','Pizza/Sandwich').replace('Russian','European')
                                               .replace('Turkish','African/Middle Eastern').replace('Bangladeshi','Asian').replace('Chinese/Japanese','Asian')
                                               .replace('Tapas','Caribbean').replace('Hawaiian','American').replace('Continental','American')
                                               .replace('Vegan','Other').replace('Creole','Other').replace('Polish','European')
                                               .replace('Brazilian','Latin American').replace('Filipino','Asian').replace('Pakistani','Asian')
                                               .replace('Pancakes/Waffles','Bakery/Desert').replace('Ethiopian','African/Middle Eastern').replace('Moroccan','African/Middle Eastern')
                                               .replace('Egyptian','African/Middle Eastern').replace('Creole/Cajun','Other').replace('German','European')
                                               .replace('Portuguese','European').replace('Soups','Pizza/Sandwich').replace('Afghan','Asian')
                                               .replace('Californian','American').replace('Southwestern','American').replace('Iranian','African/Middle Eastern')
                                               .replace('Czech','European').replace('Chilean','Latin American').replace('Scandinavian','European').replace('Indonesian','Asian')
                                               .replace('American Rest','American').replace('Café/Coffee/Tea','Cafe').replace('Caribbean (Inc. Cuban, Puerto Rican)','Caribbean')
                                               .replace('Cajun','Other').replace('Other/Cajun','Other').replace('Other/Other','Other')
                                               .replace('Asian/Asian','Asian').replace('Middle Eastern Rest','African/Middle Eastern')
                                               .replace('Cakes, Cupcakes, Desserts','Bakery/Desert').replace('Cakes, Cupcakes, Desserts','Bakery/Desert').replace('Vegetarian','Other')
                                              .replace('Asian Rest','Asian').replace('Other Rest','Other').replace('Hotdogs','Other').replace('Australian','Other').replace('Bagels','Bakery/Desert'))

df['KEY'] = df[['DBA', 'BUILDING', 'STREET', 'ZIPCODE']].astype(str).apply(lambda x: ' '.join(x), axis=1)
df['COUNT'] = 1
df['TOOLTIP'] = df['KEY'] + ' ;CUISINE : '+ df['CUISINE_DESCRIPTION'] +' ;SCORE : ' + df['SCORE'].astype(str)
In [4]:
#FILTERING DATA AND PREPARING IT FOR ANALYSIS 

#Keeping only 'Manhattan' data from Jan'16 - Mar'20
nyc_rest=df[(df.INSPECTION_DATE > '2015-12-31') & (df.INSPECTION_DATE < '2020-04-01') & (df.BORO == 'Manhattan')]
#print("num ratings: {} num unique restaurants: {}".format(len(nyc_rest), len(nyc_rest.KEY.unique())))

#Removing Missing Values
nyc_rest = nyc_rest[nyc_rest['GRADE_DATE'].notna()] # Dropping records where Grade_Date are NA
nyc_rest = nyc_rest[nyc_rest['Latitude'].notna()] # Dropping records where Latitude are NA
#print("num ratings: {} num unique restaurants: {}".format(len(nyc_rest), len(nyc_rest.KEY.unique())))

#Identify Latest Inspection Data and Keep Only those records
nyc_rest_int1=nyc_rest.groupby("CAMIS").GRADE_DATE.max()  #fetching latest grade date
nyc_rest_int2= pd.DataFrame(nyc_rest_int1).reset_index()
nyc_rest_int2.CAMIS.value_counts().sort_values()
nyc_rest = pd.merge(nyc_rest,nyc_rest_int2,how='inner',on=['CAMIS','GRADE_DATE'])
#nyc_rest[nyc_rest.CAMIS==50050805]
In [5]:
#Exploratory Data Analysis

print ("Rows     : " ,nyc_rest.shape[0])
print ("Columns  : " ,nyc_rest.shape[1])
print ("\nFeatures : \n" ,nyc_rest.columns.tolist())
print ("\nMissing values :  \n", nyc_rest.isna().sum())
print ("\nUnique values :  \n",nyc_rest.nunique())
Rows     :  22913
Columns  :  19

Features : 
 ['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'CUISINE_DESCRIPTION', 'INSPECTION_DATE', 'ACTION', 'CRITICAL_FLAG', 'SCORE', 'GRADE', 'GRADE_DATE', 'Latitude', 'Longitude', 'category', 'KEY', 'COUNT', 'TOOLTIP']

Missing values :  
 CAMIS                    0
DBA                     15
BORO                     0
BUILDING                 0
STREET                   0
ZIPCODE                476
CUISINE_DESCRIPTION      0
INSPECTION_DATE          0
ACTION                   0
CRITICAL_FLAG          150
SCORE                    0
GRADE                    0
GRADE_DATE               0
Latitude                 0
Longitude                0
category                 0
KEY                      0
COUNT                    0
TOOLTIP                  0
dtype: int64

Unique values :  
 CAMIS                  9922
DBA                    7889
BORO                      1
BUILDING               2140
STREET                  689
ZIPCODE                  81
CUISINE_DESCRIPTION      69
INSPECTION_DATE         440
ACTION                    3
CRITICAL_FLAG             2
SCORE                    61
GRADE                     4
GRADE_DATE              440
Latitude               7905
Longitude              7905
category                 10
KEY                    9861
COUNT                     1
TOOLTIP                9904
dtype: int64
In [6]:
#Question 1 : Identify most hygenic(no violations) and least hygenic restaurants in Manhattan
In [7]:
#A) Restaurants with no violations (Most hygenic)
nyc_rest_best = nyc_rest[(nyc_rest['ACTION']  == 'No violations were recorded at the time of this inspection.')]
nyc_rest_best.reset_index(inplace=True)
nyc_rest_best = nyc_rest_best.drop(columns='index')

#B) Least hygenic restaurants
nyc_rest_worst = nyc_rest[nyc_rest.SCORE > np.percentile(nyc_rest.SCORE,99)]
nyc_rest_worst.reset_index(inplace=True)
nyc_rest_worst = nyc_rest_worst.drop(columns='index')
In [8]:
# Mapping most (green) and least(red) hygenic restaurants on map with tooltips
new_york = folium.Map(
    location=[40.75,-73.92], #[40.7128,-73.9352]
    zoom_start=11.75  
)

for i in range(len(nyc_rest_best)):
    folium.Marker([nyc_rest_best['Latitude'][i], nyc_rest_best['Longitude'][i]],popup=nyc_rest_best['KEY'][i],tooltip=nyc_rest_best['TOOLTIP'][i], icon=folium.Icon(color='green')).add_to(new_york)

for i in range(len(nyc_rest_worst)):
    folium.Marker([nyc_rest_worst['Latitude'][i], nyc_rest_worst['Longitude'][i]],popup=nyc_rest_worst['KEY'][i],tooltip=nyc_rest_worst['TOOLTIP'][i], icon=folium.Icon(color='red')).add_to(new_york)

new_york
Out[8]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [9]:
#Question 2 : Identify best and worst restaurants by cuisine.  See if anything surprises you. 
In [10]:
#Identify Grade A restaurants with no critical violations (best)
nyc_hygenic= nyc_rest[(nyc_rest['GRADE']  == 'A') & (nyc_rest['CRITICAL_FLAG']  == 'N')]
nyc_hygenic.reset_index(inplace=True)
nyc_hygenic = nyc_hygenic.drop(columns='index')
In [11]:
#Identify Grade B and C restaurants with critical violations (worst)
nyc_not_hygenic= nyc_rest[((nyc_rest['GRADE']  == 'B') | (nyc_rest['GRADE']  == 'C')) & (nyc_rest['CRITICAL_FLAG']  == 'Y')]
nyc_not_hygenic.reset_index(inplace=True)
nyc_not_hygenic = nyc_not_hygenic.drop(columns='index')
In [12]:
#Creates the sunburst interactive chart. If you click on the inner circle then you will get details
plt.figure(figsize=(40,40));

df1 = px.data.tips()
fig = px.sunburst(nyc_hygenic, path=['category', 'CUISINE_DESCRIPTION'], values='COUNT',title='Most Hygenic Restaurants (GRADE A)')
fig.update_layout(title_x=0.5)
fig.show()


plt.figure(figsize=(40,40));

df1 = px.data.tips()
fig = px.sunburst(nyc_not_hygenic, path=['category', 'CUISINE_DESCRIPTION'], values='COUNT', title='Non Hygenic Restaurants (GRADE B or C)')
fig.update_layout(title_x=0.5)
fig.show()
<Figure size 2880x2880 with 0 Axes>
<Figure size 2880x2880 with 0 Axes>
In [13]:
# Question 3 : Identify how inspection and grading have changed for Manhattan restaurants over time (2017 - 2019) for different cuisines.
In [14]:
# Inspection details for different cuisines over time 
df_manh=df[(df.INSPECTION_DATE > '2016-12-31') & (df.INSPECTION_DATE < '2020-01-01') & (df.BORO == 'Manhattan')]
df_inspection = df_manh[['CAMIS','INSPECTION_DATE','category','COUNT']]
df_inspection = df_inspection.drop_duplicates()
df_inspection['YEAR'] = pd.DatetimeIndex(df_inspection['INSPECTION_DATE']).year
df_inspection.reset_index(inplace=True)
df_inspection = df_inspection.drop(columns='index')

# Number of inspections by year
df_inspection_yr = df_inspection.groupby(['category','YEAR']).sum()['COUNT'].reset_index()
df_camis_yr = df_inspection.groupby(['category','YEAR']).CAMIS.nunique().reset_index()
df_manh_inspection = pd.merge(df_inspection_yr,df_camis_yr,how='inner',on=['category','YEAR'])
df_manh_inspection['AVG_INSPECTIONS'] = df_manh_inspection['COUNT']/df_manh_inspection['CAMIS']

# Renaming columns
df_manh_inspection = df_manh_inspection.rename(columns={ 'CAMIS': 'INSPECTED RESTAURANTS','COUNT': '# OF INSPECTIONS'})
In [15]:
# Grading details for different cuisines over time
df_graded=df[(df.INSPECTION_DATE > '2016-12-31') & (df.INSPECTION_DATE < '2020-01-01') & (df.BORO == 'Manhattan')]
df_graded = df_graded[df_graded['GRADE_DATE'].notna()] # Dropping records where Grade_Date are NA
df_graded = df_graded[df_graded['Latitude'].notna()] # Dropping records where Latitude are NA
df_graded = df_graded[['CAMIS','GRADE_DATE','category','COUNT']]
df_graded['YEAR'] = pd.DatetimeIndex(df_graded['GRADE_DATE']).year
df_graded.reset_index(inplace=True)
df_graded = df_graded.drop(columns='index')

# Number of violations by year
df_graded_viol_yr = df_graded.groupby(['category','YEAR']).sum()['COUNT'].reset_index()
df_camis_viol_yr = df_graded.groupby(['category','YEAR']).CAMIS.nunique().reset_index()
df_manh_grade_viol = pd.merge(df_graded_viol_yr,df_camis_viol_yr,how='inner',on=['category','YEAR'])
df_manh_grade_viol['AVG_VIOLATIONS'] = df_manh_grade_viol['COUNT']/df_manh_grade_viol['CAMIS']

# Renaming columns
df_manh_grade_viol = df_manh_grade_viol.rename(columns={ 'CAMIS': 'GRADED RESTAURANTS','COUNT': '# OF VIOLATIONS'})
In [18]:
#Joining the above tables for final analysis
df_manh_analysis = pd.merge(df_manh_inspection,df_manh_grade_viol,how='inner',on=['category','YEAR'])
df_manh_analysis.YEAR = df_manh_analysis.YEAR.astype('str')
df_manh_analysis
Out[18]:
category YEAR # OF INSPECTIONS INSPECTED RESTAURANTS AVG_INSPECTIONS # OF VIOLATIONS GRADED RESTAURANTS AVG_VIOLATIONS
0 African/Middle Eastern 2017 300 172 1.744186 421 158 2.664557
1 African/Middle Eastern 2018 456 222 2.054054 668 212 3.150943
2 African/Middle Eastern 2019 526 250 2.104000 760 238 3.193277
3 American 2017 3383 2118 1.597262 4888 1960 2.493878
4 American 2018 4803 2590 1.854440 7254 2497 2.905086
5 American 2019 5201 2848 1.826194 8189 2734 2.995245
6 Asian 2017 1813 965 1.878756 2523 870 2.900000
7 Asian 2018 2960 1289 2.296354 4438 1239 3.581921
8 Asian 2019 3500 1525 2.295082 5287 1450 3.646207
9 Bakery/Desert 2017 594 372 1.596774 881 361 2.440443
10 Bakery/Desert 2018 826 473 1.746300 1253 464 2.700431
11 Bakery/Desert 2019 1029 566 1.818021 1541 549 2.806922
12 Cafe 2017 871 587 1.483816 1228 564 2.177305
13 Cafe 2018 1286 793 1.621690 1779 762 2.334646
14 Cafe 2019 1490 922 1.616052 2223 887 2.506201
15 Caribbean 2017 60 37 1.621622 94 33 2.848485
16 Caribbean 2018 121 49 2.469388 172 48 3.583333
17 Caribbean 2019 132 59 2.237288 159 53 3.000000
18 European 2017 1335 804 1.660448 1955 724 2.700276
19 European 2018 1983 975 2.033846 2914 941 3.096706
20 European 2019 2202 1105 1.992760 3370 1058 3.185255
21 Latin American 2017 591 313 1.888179 826 287 2.878049
22 Latin American 2018 905 397 2.279597 1378 384 3.588542
23 Latin American 2019 977 442 2.210407 1462 417 3.505995
24 Other 2017 781 512 1.525391 1066 479 2.225470
25 Other 2018 1152 647 1.780526 1720 624 2.756410
26 Other 2019 1415 797 1.775408 2133 756 2.821429
27 Pizza/Sandwich 2017 1130 659 1.714719 1602 628 2.550955
28 Pizza/Sandwich 2018 1659 821 2.020706 2416 808 2.990099
29 Pizza/Sandwich 2019 1937 923 2.098592 2897 898 3.226058
In [21]:
#sns.set(rc={'figure.figsize':(11.7,8.27)})
g=sns.FacetGrid(data=df_manh_analysis, col = 'category', margin_titles=True)
g.map(plt.bar,'YEAR','AVG_VIOLATIONS');
g1=sns.FacetGrid(data=df_manh_analysis, col = 'category', margin_titles=True)
g1.map(plt.bar,'YEAR','AVG_INSPECTIONS')
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x1db1f7c8>
In [ ]: